Load results and normalize results for every amplicon
y <- readResultFile("result.csv",FALSE, ",")
targetNames <- y$Target
y$Target <- NULL
resultFileNames <- c("SN1-17_Run_14.xls", "SN1-21_Run_17.xls","SN1-23-Run_18.xls","SN1-25-Run_19.xls", "SN1-26-Run_20.xls")
for (i in 1:length(resultFileNames)){
res <- readResultFile(resultFileNames[i], FALSE)
namesOfColumns <- names(res)
y[namesOfColumns] <- res
y$Target <- NULL
}
y$Target <- targetNames
y <- y[complete.cases(y[,-1]), ]
y <- y[-53,]
sumsTotal <- rowSums(y[,-ncol(y)])
y[,-ncol(y)] <- log(y[,-ncol(y)])
#sums <- colSums(y[,-ncol(y)])
#y[,-ncol(y)] <- sweep(y[,-ncol(y)], 2, sums, '/' )
sums <- rowSums(y[,-ncol(y)])
y[,-ncol(y)]<- sweep(y[,-ncol(y)], 1, sums, '/' )
y$sum <- sumsTotal
Simple KMeans clustering with a given number of clusters
mydata <- y[, 1:(ncol(y)-2)]
fit <- kmeans(mydata, 4)
#library(cluster)
#clusplot(mydata, fit$cluster, color=TRUE, shade=TRUE,
# labels=2, lines=0)
y$clusterKMeans <- fit$cluster
cat("KMeans given number of clusters: 4\n")
## KMeans given number of clusters: 4
Choose number of clusters using Calinski-Harabasz criterion
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.2-0
fit <- cascadeKM(scale(mydata, center = TRUE, scale = TRUE), 1, 4, iter = 1000)
plot(fit, sortg = TRUE, grpmts.plot = TRUE)
calinski.best <- as.numeric(which.max(fit$results[2,]))
cat("Calinski criterion optimal number of clusters:", calinski.best, "\n")
## Calinski criterion optimal number of clusters: 4
y$clusterCalinski <- fit$partition[,calinski.best]
Choose number of clusters using Affinity propagation
library(apcluster)
##
## Attaching package: 'apcluster'
##
## The following object is masked from 'package:stats':
##
## heatmap
d.apclus <- apcluster(negDistMat(r=2), mydata)
cat("Affinity propagation optimal number of clusters:", length(d.apclus@clusters), "\n")
## Affinity propagation optimal number of clusters: 20
y$clusterAP <- labels(d.apclus, type="enum")
Hierarchical clustering
d_dist <- dist(as.matrix(mydata))
fit <- hclust(d_dist, method="ward")
plot(fit)
groups <- cutree(fit, k=4)
rect.hclust(fit, k=4, border="red")
y$clusterHierarchy <- groups
Hierarchical clustering with correlation matrix
t.mydata <- t(mydata)
colnames(t.mydata) <- y$Target
data.scale<- scale(t.mydata, center=TRUE, scale=TRUE)
cor_data <- cor(data.scale)
cor_d <- as.dist(cor_data)
fit <- hclust(cor_d, method="ward")
plot(fit)
groups <- cutree(fit, k=4)
rect.hclust(fit, k=4, border="red")
y$clusterCor <- groups
Load prepared data
prepared_data <- read.table("../data/features.tsv", sep = "\t", header=TRUE)
rownames(prepared_data) <- prepared_data$Target
rownames(y) <- y$Target
valid_names <- intersect(y$Target, prepared_data$Target)
prepared_data <- prepared_data[valid_names,]
y <- y[valid_names,]
| ### KMeans. Draw features for every cluster |
|---|
| ### Calinski criterion. Draw features for every cluster |
r data$cluster <- as.factor(y$clusterCalinski) histogram(~data$Amplicon_GC_content|data$cluster, main = "KMeans.Calinski-Harabasz criterion") |
r histogram(~data$Fwd_Primer_GC_content|data$cluster, main = "KMeans.Calinski-Harabasz criterion") |
r histogram(~data$Rev_Primer_GC_content|data$cluster, main = "KMeans.Calinski-Harabasz criterion") |
r histogram(~data$Amplicon_Len|data$cluster, main = "KMeans.Calinski-Harabasz criterion") |
r histogram(~data$Fwd_Primer_Len|data$cluster, main = "KMeans.Calinski-Harabasz criterion") |
r histogram(~data$Rev_Primer_Len|data$cluster, main = "KMeans.Calinski-Harabasz criterion") |
r histogram(~data$Amplicon_MaxPolyX|data$cluster, main = "KMeans.Calinski-Harabasz criterion") |
r histogram(~data$Fwd_Primer_MaxPolyX|data$cluster, main = "KMeans.Calinski-Harabasz criterion") |
r histogram(~data$Rev_Primer_MaxPolyX|data$cluster, main = "KMeans.Calinski-Harabasz criterion") |
r histogram(~data$Amplicon_Entropy|data$cluster, main = "KMeans.Calinski-Harabasz criterion") |
r histogram(~data$Fwd_Primer_Entropy|data$cluster, main = "KMeans.Calinski-Harabasz criterion") |
r histogram(~data$Rev_Primer_Entropy|data$cluster, main = "KMeans.Calinski-Harabasz criterion") |
r histogram(~data$Amplicon_Hash|data$cluster, main = "KMeans.Calinski-Harabasz criterion") |
r histogram(~data$Fwd_Primer_Hash|data$cluster, main = "KMeans.Calinski-Harabasz criterion") |
r histogram(~data$Rev_Primer_Hash|data$cluster, main = "KMeans.Calinski-Harabasz criterion") |
r histogram(~data$Fwd_END_3|data$cluster, main = "KMeans.Calinski-Harabasz criterion") |
r histogram(~data$Rev_END_3|data$cluster, main = "KMeans.Calinski-Harabasz criterion") |
r histogram(~data$Start_Dangling_End|data$cluster, main = "KMeans.Calinski-Harabasz criterion") |
r histogram(~data$Stop_Dangling_End|data$cluster, main = "KMeans.Calinski-Harabasz criterion") |
data$cluster <- as.factor(y$clusterHierarchy)
histogram(~data$Amplicon_GC_content|data$cluster, main = "Hierarchical clustering")
histogram(~data$Fwd_Primer_GC_content|data$cluster, main = "Hierarchical clustering")
histogram(~data$Rev_Primer_GC_content|data$cluster, main = "Hierarchical clustering")
histogram(~data$Amplicon_Len|data$cluster, main = "Hierarchical clustering")
histogram(~data$Fwd_Primer_Len|data$cluster, main = "Hierarchical clustering")
histogram(~data$Rev_Primer_Len|data$cluster, main = "Hierarchical clustering")
histogram(~data$Amplicon_MaxPolyX|data$cluster, main = "Hierarchical clustering")
histogram(~data$Fwd_Primer_MaxPolyX|data$cluster, main = "Hierarchical clustering")
histogram(~data$Rev_Primer_MaxPolyX|data$cluster, main = "Hierarchical clustering")
histogram(~data$Amplicon_Entropy|data$cluster, main = "Hierarchical clustering")
histogram(~data$Fwd_Primer_Entropy|data$cluster, main = "Hierarchical clustering")
histogram(~data$Rev_Primer_Entropy|data$cluster, main = "Hierarchical clustering")
histogram(~data$Amplicon_Hash|data$cluster, main = "Hierarchical clustering")
histogram(~data$Fwd_Primer_Hash|data$cluster, main = "Hierarchical clustering")
histogram(~data$Rev_Primer_Hash|data$cluster, main = "Hierarchical clustering")
histogram(~data$Fwd_END_3|data$cluster, main = "Hierarchical clustering")
histogram(~data$Rev_END_3|data$cluster, main = "Hierarchical clustering")
histogram(~data$Start_Dangling_End|data$cluster, main = "Hierarchical clustering")
histogram(~data$Stop_Dangling_End|data$cluster, main = "Hierarchical clustering")
data$cluster <- as.factor(y$clusterCor)
histogram(~data$Amplicon_GC_content|data$cluster, main = "Correlation clustering")
histogram(~data$Fwd_Primer_GC_content|data$cluster, main = "Correlation clustering")
histogram(~data$Rev_Primer_GC_content|data$cluster, main = "Correlation clustering")
histogram(~data$Amplicon_Len|data$cluster, main = "Correlation clustering")
histogram(~data$Fwd_Primer_Len|data$cluster, main = "Correlation clustering")
histogram(~data$Rev_Primer_Len|data$cluster, main = "Correlation clustering")
histogram(~data$Amplicon_MaxPolyX|data$cluster, main = "Correlation clustering")
histogram(~data$Fwd_Primer_MaxPolyX|data$cluster, main = "Correlation clustering")
histogram(~data$Rev_Primer_MaxPolyX|data$cluster, main = "Correlation clustering")
histogram(~data$Amplicon_Entropy|data$cluster, main = "Correlation clustering")
histogram(~data$Fwd_Primer_Entropy|data$cluster, main = "Correlation clustering")
histogram(~data$Rev_Primer_Entropy|data$cluster, main = "Correlation clustering")
histogram(~data$Amplicon_Hash|data$cluster, main = "Correlation clustering")
histogram(~data$Fwd_Primer_Hash|data$cluster, main = "Correlation clustering")
histogram(~data$Rev_Primer_Hash|data$cluster, main = "Correlation clustering")
histogram(~data$Fwd_END_3|data$cluster, main = "Correlation clustering")
histogram(~data$Rev_END_3|data$cluster, main = "Correlation clustering")
histogram(~data$Start_Dangling_End|data$cluster, main = "Correlation clustering")
histogram(~data$Stop_Dangling_End|data$cluster, main = "Correlation clustering")
library(lattice)
data <- prepared_data
data$cluster <- as.factor(y$clusterKMeans)
histogram(data$cluster)
amplicon_gc_content <- getFactoredDataTrunc(data$Amplicon_GC_content)
## 25% 50% 75% 100%
## 33.77483 41.11111 51.26050 70.21277
counts <- table(data$cluster, amplicon_gc_content)
barplot(counts, main="Amplicon Distribution by Amplicon GC-content and Clusters",
xlab="GC-content", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
fwd_primer_gc_content <- getFactoredDataTrunc(data$Fwd_Primer_GC_content)
## 25% 50% 75% 100%
## 38.46154 43.47826 52.17391 64.70588
counts <- table(data$cluster, fwd_primer_gc_content)
barplot(counts, main="Amplicon Distribution by Fwd Primer GC-content and Clusters",
xlab="GC-content", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
rev_primer_gc_content <- getFactoredDataTrunc(data$Rev_Primer_GC_content)
## 25% 50% 75% 100%
## 38.46154 42.30769 47.82609 63.15789
counts <- table(data$cluster, rev_primer_gc_content)
barplot(counts, main="Amplicon Distribution by Rev Primer GC-content and Clusters",
xlab="GC-content", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
amplicon_len <- getFactoredDataTrunc(data$Amplicon_Len)
## 25% 50% 75% 100%
## 114 151 170 188
counts <- table(data$cluster, amplicon_len)
barplot(counts, main="Amplicon Distribution by Amplicon Length and Clusters",
xlab="Length", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
fwd_primer_len <- getFactoredDataTrunc(data$Fwd_Primer_Len)
## 25% 50% 75% 100%
## 22 25 28 33
counts <- table(data$cluster, fwd_primer_len)
barplot(counts, main="Amplicon Distribution by Fwd Primer Length and Clusters",
xlab="Length", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
rev_primer_len <- getFactoredDataTrunc(data$Rev_Primer_Len)
## 25% 50% 75% 100%
## 23 26 28 33
counts <- table(data$cluster, rev_primer_len)
barplot(counts, main="Amplicon Distribution by Rev Primer Length and Clusters",
xlab="Length", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
amplicon_MaxPolyX <- getFactoredDataTrunc(data$Amplicon_MaxPolyX)
## 25% 50% 75% 100%
## 3 4 4 8
counts <- table(data$cluster, amplicon_MaxPolyX)
barplot(counts, main="Amplicon Distribution by Amplicon MaxPolyX and Clusters",
xlab="MaxPolyX", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
fwd_primer_MaxPolyX <- getFactoredDataTrunc(data$Fwd_Primer_MaxPolyX)
## 25% 50% 75% 100%
## 2 2 3 4
counts <- table(data$cluster, fwd_primer_MaxPolyX)
barplot(counts, main="Amplicon Distribution by Fwd Primer MaxPolyX and Clusters",
xlab="MaxPolyX", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
rev_primer_MaxPolyX <- getFactoredDataTrunc(data$Rev_Primer_MaxPolyX)
## 25% 50% 75% 100%
## 2 2 3 4
counts <- table(data$cluster, rev_primer_MaxPolyX)
barplot(counts, main="Amplicon Distribution by Rev Primer MaxPolyX and Clusters",
xlab="MaxPolyX", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
amplicon_Entropy <- getFactoredData(data$Amplicon_Entropy)
## 25% 50% 75% 100%
## 0.3420570 0.3456354 0.3482220 0.3541775
counts <- table(data$cluster, amplicon_Entropy)
barplot(counts, main="Amplicon Distribution by Amplicon Entropy and Clusters",
xlab="Entropy", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
fwd_primer_Entropy <- getFactoredData(data$Fwd_Primer_Entropy)
## 25% 50% 75% 100%
## 0.3417350 0.3448008 0.3465696 0.3505145
counts <- table(data$cluster, fwd_primer_Entropy)
barplot(counts, main="Amplicon Distribution by Fwd Primer Entropy and Clusters",
xlab="Entropy", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
rev_primer_Entropy <- getFactoredData(data$Rev_Primer_Entropy)
## 25% 50% 75% 100%
## 0.3432679 0.3452135 0.3465696 0.3505145
counts <- table(data$cluster, rev_primer_Entropy)
barplot(counts, main="Amplicon Distribution by Rev Primer Entropy and Clusters",
xlab="Entropy", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
amplicon_Hash <- getFactoredDataTrunc(data$Amplicon_Hash)
## 25% 50% 75% 100%
## 18992277214 34196138165 48742458429 62929952416
counts <- table(data$cluster, amplicon_Hash)
barplot(counts, main="Amplicon Distribution by Amplicon Hash and Clusters",
xlab="Hash", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
fwd_primer_Hash <- getFactoredDataTrunc(data$Fwd_Primer_Hash)
## 25% 50% 75% 100%
## 13668885842 26663877899 44900388165 62098312721
counts <- table(data$cluster, fwd_primer_Hash)
barplot(counts, main="Amplicon Distribution by Fwd Primer Hash and Clusters",
xlab="Hash", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
rev_primer_Hash <- getFactoredDataTrunc(data$Rev_Primer_Hash)
## 25% 50% 75% 100%
## 18803378911 33335803072 48501833706 62915016535
counts <- table(data$cluster, rev_primer_Hash)
barplot(counts, main="Amplicon Distribution by Rev Primer Hash and Clusters",
xlab="Hash", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
histogram(~data$cluster|data$Fwd_END_3, main = "KMeans")
histogram(~data$cluster|data$Rev_END_3, main = "KMeans")
histogram(~data$cluster|data$Start_Dangling_End, main = "KMeans")
histogram(~data$cluster|data$Stop_Dangling_End, main = "KMeans")
data$cluster <- as.factor(y$clusterCalinski)
histogram(data$cluster)
amplicon_gc_content <- getFactoredDataTrunc(data$Amplicon_GC_content)
## 25% 50% 75% 100%
## 33.77483 41.11111 51.26050 70.21277
counts <- table(data$cluster, amplicon_gc_content)
barplot(counts, main="Amplicon Distribution by Amplicon GC-content and Clusters",
xlab="GC-content", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
fwd_primer_gc_content <- getFactoredDataTrunc(data$Fwd_Primer_GC_content)
## 25% 50% 75% 100%
## 38.46154 43.47826 52.17391 64.70588
counts <- table(data$cluster, fwd_primer_gc_content)
barplot(counts, main="Amplicon Distribution by Fwd Primer GC-content and Clusters",
xlab="GC-content", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
rev_primer_gc_content <- getFactoredDataTrunc(data$Rev_Primer_GC_content)
## 25% 50% 75% 100%
## 38.46154 42.30769 47.82609 63.15789
counts <- table(data$cluster, rev_primer_gc_content)
barplot(counts, main="Amplicon Distribution by Rev Primer GC-content and Clusters",
xlab="GC-content", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
amplicon_len <- getFactoredDataTrunc(data$Amplicon_Len)
## 25% 50% 75% 100%
## 114 151 170 188
counts <- table(data$cluster, amplicon_len)
barplot(counts, main="Amplicon Distribution by Amplicon Length and Clusters",
xlab="Length", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
fwd_primer_len <- getFactoredDataTrunc(data$Fwd_Primer_Len)
## 25% 50% 75% 100%
## 22 25 28 33
counts <- table(data$cluster, fwd_primer_len)
barplot(counts, main="Amplicon Distribution by Fwd Primer Length and Clusters",
xlab="Length", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
rev_primer_len <- getFactoredDataTrunc(data$Rev_Primer_Len)
## 25% 50% 75% 100%
## 23 26 28 33
counts <- table(data$cluster, rev_primer_len)
barplot(counts, main="Amplicon Distribution by Rev Primer Length and Clusters",
xlab="Length", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
amplicon_MaxPolyX <- getFactoredDataTrunc(data$Amplicon_MaxPolyX)
## 25% 50% 75% 100%
## 3 4 4 8
counts <- table(data$cluster, amplicon_MaxPolyX)
barplot(counts, main="Amplicon Distribution by Amplicon MaxPolyX and Clusters",
xlab="MaxPolyX", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
fwd_primer_MaxPolyX <- getFactoredDataTrunc(data$Fwd_Primer_MaxPolyX)
## 25% 50% 75% 100%
## 2 2 3 4
counts <- table(data$cluster, fwd_primer_MaxPolyX)
barplot(counts, main="Amplicon Distribution by Fwd Primer MaxPolyX and Clusters",
xlab="MaxPolyX", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
rev_primer_MaxPolyX <- getFactoredDataTrunc(data$Rev_Primer_MaxPolyX)
## 25% 50% 75% 100%
## 2 2 3 4
counts <- table(data$cluster, rev_primer_MaxPolyX)
barplot(counts, main="Amplicon Distribution by Rev Primer MaxPolyX and Clusters",
xlab="MaxPolyX", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
amplicon_Entropy <- getFactoredData(data$Amplicon_Entropy)
## 25% 50% 75% 100%
## 0.3420570 0.3456354 0.3482220 0.3541775
counts <- table(data$cluster, amplicon_Entropy)
barplot(counts, main="Amplicon Distribution by Amplicon Entropy and Clusters",
xlab="Entropy", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
fwd_primer_Entropy <- getFactoredData(data$Fwd_Primer_Entropy)
## 25% 50% 75% 100%
## 0.3417350 0.3448008 0.3465696 0.3505145
counts <- table(data$cluster, fwd_primer_Entropy)
barplot(counts, main="Amplicon Distribution by Fwd Primer Entropy and Clusters",
xlab="Entropy", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
rev_primer_Entropy <- getFactoredData(data$Rev_Primer_Entropy)
## 25% 50% 75% 100%
## 0.3432679 0.3452135 0.3465696 0.3505145
counts <- table(data$cluster, rev_primer_Entropy)
barplot(counts, main="Amplicon Distribution by Rev Primer Entropy and Clusters",
xlab="Entropy", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
amplicon_Hash <- getFactoredDataTrunc(data$Amplicon_Hash)
## 25% 50% 75% 100%
## 18992277214 34196138165 48742458429 62929952416
counts <- table(data$cluster, amplicon_Hash)
barplot(counts, main="Amplicon Distribution by Amplicon Hash and Clusters",
xlab="Hash", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
fwd_primer_Hash <- getFactoredDataTrunc(data$Fwd_Primer_Hash)
## 25% 50% 75% 100%
## 13668885842 26663877899 44900388165 62098312721
counts <- table(data$cluster, fwd_primer_Hash)
barplot(counts, main="Amplicon Distribution by Fwd Primer Hash and Clusters",
xlab="Hash", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
rev_primer_Hash <- getFactoredDataTrunc(data$Rev_Primer_Hash)
## 25% 50% 75% 100%
## 18803378911 33335803072 48501833706 62915016535
counts <- table(data$cluster, rev_primer_Hash)
barplot(counts, main="Amplicon Distribution by Rev Primer Hash and Clusters",
xlab="Hash", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
histogram(~data$cluster|data$Fwd_END_3, main = "KMeans.Calinski-Harabasz criterion")
histogram(~data$cluster|data$Rev_END_3, main = "KMeans.Calinski-Harabasz criterion")
histogram(~data$cluster|data$Start_Dangling_End, main = "KMeans.Calinski-Harabasz criterion")
histogram(~data$cluster|data$Stop_Dangling_End, main = "KMeans.Calinski-Harabasz criterion")
data$cluster <- as.factor(y$clusterHierarchy)
histogram(data$cluster)
amplicon_gc_content <- getFactoredDataTrunc(data$Amplicon_GC_content)
## 25% 50% 75% 100%
## 33.77483 41.11111 51.26050 70.21277
counts <- table(data$cluster, amplicon_gc_content)
barplot(counts, main="Amplicon Distribution by Amplicon GC-content and Clusters",
xlab="GC-content", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
fwd_primer_gc_content <- getFactoredDataTrunc(data$Fwd_Primer_GC_content)
## 25% 50% 75% 100%
## 38.46154 43.47826 52.17391 64.70588
counts <- table(data$cluster, fwd_primer_gc_content)
barplot(counts, main="Amplicon Distribution by Fwd Primer GC-content and Clusters",
xlab="GC-content", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
rev_primer_gc_content <- getFactoredDataTrunc(data$Rev_Primer_GC_content)
## 25% 50% 75% 100%
## 38.46154 42.30769 47.82609 63.15789
counts <- table(data$cluster, rev_primer_gc_content)
barplot(counts, main="Amplicon Distribution by Rev Primer GC-content and Clusters",
xlab="GC-content", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
amplicon_len <- getFactoredDataTrunc(data$Amplicon_Len)
## 25% 50% 75% 100%
## 114 151 170 188
counts <- table(data$cluster, amplicon_len)
barplot(counts, main="Amplicon Distribution by Amplicon Length and Clusters",
xlab="Length", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
fwd_primer_len <- getFactoredDataTrunc(data$Fwd_Primer_Len)
## 25% 50% 75% 100%
## 22 25 28 33
counts <- table(data$cluster, fwd_primer_len)
barplot(counts, main="Amplicon Distribution by Fwd Primer Length and Clusters",
xlab="Length", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
rev_primer_len <- getFactoredDataTrunc(data$Rev_Primer_Len)
## 25% 50% 75% 100%
## 23 26 28 33
counts <- table(data$cluster, rev_primer_len)
barplot(counts, main="Amplicon Distribution by Rev Primer Length and Clusters",
xlab="Length", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
amplicon_MaxPolyX <- getFactoredDataTrunc(data$Amplicon_MaxPolyX)
## 25% 50% 75% 100%
## 3 4 4 8
counts <- table(data$cluster, amplicon_MaxPolyX)
barplot(counts, main="Amplicon Distribution by Amplicon MaxPolyX and Clusters",
xlab="MaxPolyX", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
fwd_primer_MaxPolyX <- getFactoredDataTrunc(data$Fwd_Primer_MaxPolyX)
## 25% 50% 75% 100%
## 2 2 3 4
counts <- table(data$cluster, fwd_primer_MaxPolyX)
barplot(counts, main="Amplicon Distribution by Fwd Primer MaxPolyX and Clusters",
xlab="MaxPolyX", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
rev_primer_MaxPolyX <- getFactoredDataTrunc(data$Rev_Primer_MaxPolyX)
## 25% 50% 75% 100%
## 2 2 3 4
counts <- table(data$cluster, rev_primer_MaxPolyX)
barplot(counts, main="Amplicon Distribution by Rev Primer MaxPolyX and Clusters",
xlab="MaxPolyX", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
amplicon_Entropy <- getFactoredData(data$Amplicon_Entropy)
## 25% 50% 75% 100%
## 0.3420570 0.3456354 0.3482220 0.3541775
counts <- table(data$cluster, amplicon_Entropy)
barplot(counts, main="Amplicon Distribution by Amplicon Entropy and Clusters",
xlab="Entropy", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
fwd_primer_Entropy <- getFactoredData(data$Fwd_Primer_Entropy)
## 25% 50% 75% 100%
## 0.3417350 0.3448008 0.3465696 0.3505145
counts <- table(data$cluster, fwd_primer_Entropy)
barplot(counts, main="Amplicon Distribution by Fwd Primer Entropy and Clusters",
xlab="Entropy", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
rev_primer_Entropy <- getFactoredData(data$Rev_Primer_Entropy)
## 25% 50% 75% 100%
## 0.3432679 0.3452135 0.3465696 0.3505145
counts <- table(data$cluster, rev_primer_Entropy)
barplot(counts, main="Amplicon Distribution by Rev Primer Entropy and Clusters",
xlab="Entropy", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
amplicon_Hash <- getFactoredDataTrunc(data$Amplicon_Hash)
## 25% 50% 75% 100%
## 18992277214 34196138165 48742458429 62929952416
counts <- table(data$cluster, amplicon_Hash)
barplot(counts, main="Amplicon Distribution by Amplicon Hash and Clusters",
xlab="Hash", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
fwd_primer_Hash <- getFactoredDataTrunc(data$Fwd_Primer_Hash)
## 25% 50% 75% 100%
## 13668885842 26663877899 44900388165 62098312721
counts <- table(data$cluster, fwd_primer_Hash)
barplot(counts, main="Amplicon Distribution by Fwd Primer Hash and Clusters",
xlab="Hash", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
rev_primer_Hash <- getFactoredDataTrunc(data$Rev_Primer_Hash)
## 25% 50% 75% 100%
## 18803378911 33335803072 48501833706 62915016535
counts <- table(data$cluster, rev_primer_Hash)
barplot(counts, main="Amplicon Distribution by Rev Primer Hash and Clusters",
xlab="Hash", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
histogram(~data$cluster|data$Fwd_END_3, main = "Hierarchical clustering")
histogram(~data$cluster|data$Rev_END_3, main = "Hierarchical clustering")
histogram(~data$cluster|data$Start_Dangling_End, main = "Hierarchical clustering")
histogram(~data$cluster|data$Stop_Dangling_End, main = "Hierarchical clustering")
data$cluster <- as.factor(y$clusterCor)
histogram(data$cluster)
amplicon_gc_content <- getFactoredDataTrunc(data$Amplicon_GC_content)
## 25% 50% 75% 100%
## 33.77483 41.11111 51.26050 70.21277
counts <- table(data$cluster, amplicon_gc_content)
barplot(counts, main="Amplicon Distribution by Amplicon GC-content and Clusters",
xlab="GC-content", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
fwd_primer_gc_content <- getFactoredDataTrunc(data$Fwd_Primer_GC_content)
## 25% 50% 75% 100%
## 38.46154 43.47826 52.17391 64.70588
counts <- table(data$cluster, fwd_primer_gc_content)
barplot(counts, main="Amplicon Distribution by Fwd Primer GC-content and Clusters",
xlab="GC-content", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
rev_primer_gc_content <- getFactoredDataTrunc(data$Rev_Primer_GC_content)
## 25% 50% 75% 100%
## 38.46154 42.30769 47.82609 63.15789
counts <- table(data$cluster, rev_primer_gc_content)
barplot(counts, main="Amplicon Distribution by Rev Primer GC-content and Clusters",
xlab="GC-content", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
amplicon_len <- getFactoredDataTrunc(data$Amplicon_Len)
## 25% 50% 75% 100%
## 114 151 170 188
counts <- table(data$cluster, amplicon_len)
barplot(counts, main="Amplicon Distribution by Amplicon Length and Clusters",
xlab="Length", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
fwd_primer_len <- getFactoredDataTrunc(data$Fwd_Primer_Len)
## 25% 50% 75% 100%
## 22 25 28 33
counts <- table(data$cluster, fwd_primer_len)
barplot(counts, main="Amplicon Distribution by Fwd Primer Length and Clusters",
xlab="Length", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
rev_primer_len <- getFactoredDataTrunc(data$Rev_Primer_Len)
## 25% 50% 75% 100%
## 23 26 28 33
counts <- table(data$cluster, rev_primer_len)
barplot(counts, main="Amplicon Distribution by Rev Primer Length and Clusters",
xlab="Length", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
amplicon_MaxPolyX <- getFactoredDataTrunc(data$Amplicon_MaxPolyX)
## 25% 50% 75% 100%
## 3 4 4 8
counts <- table(data$cluster, amplicon_MaxPolyX)
barplot(counts, main="Amplicon Distribution by Amplicon MaxPolyX and Clusters",
xlab="MaxPolyX", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
fwd_primer_MaxPolyX <- getFactoredDataTrunc(data$Fwd_Primer_MaxPolyX)
## 25% 50% 75% 100%
## 2 2 3 4
counts <- table(data$cluster, fwd_primer_MaxPolyX)
barplot(counts, main="Amplicon Distribution by Fwd Primer MaxPolyX and Clusters",
xlab="MaxPolyX", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
rev_primer_MaxPolyX <- getFactoredDataTrunc(data$Rev_Primer_MaxPolyX)
## 25% 50% 75% 100%
## 2 2 3 4
counts <- table(data$cluster, rev_primer_MaxPolyX)
barplot(counts, main="Amplicon Distribution by Rev Primer MaxPolyX and Clusters",
xlab="MaxPolyX", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
amplicon_Entropy <- getFactoredData(data$Amplicon_Entropy)
## 25% 50% 75% 100%
## 0.3420570 0.3456354 0.3482220 0.3541775
counts <- table(data$cluster, amplicon_Entropy)
barplot(counts, main="Amplicon Distribution by Amplicon Entropy and Clusters",
xlab="Entropy", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
fwd_primer_Entropy <- getFactoredData(data$Fwd_Primer_Entropy)
## 25% 50% 75% 100%
## 0.3417350 0.3448008 0.3465696 0.3505145
counts <- table(data$cluster, fwd_primer_Entropy)
barplot(counts, main="Amplicon Distribution by Fwd Primer Entropy and Clusters",
xlab="Entropy", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
rev_primer_Entropy <- getFactoredData(data$Rev_Primer_Entropy)
## 25% 50% 75% 100%
## 0.3432679 0.3452135 0.3465696 0.3505145
counts <- table(data$cluster, rev_primer_Entropy)
barplot(counts, main="Amplicon Distribution by Rev Primer Entropy and Clusters",
xlab="Entropy", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
amplicon_Hash <- getFactoredDataTrunc(data$Amplicon_Hash)
## 25% 50% 75% 100%
## 18992277214 34196138165 48742458429 62929952416
counts <- table(data$cluster, amplicon_Hash)
barplot(counts, main="Amplicon Distribution by Amplicon Hash and Clusters",
xlab="Hash", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
fwd_primer_Hash <- getFactoredDataTrunc(data$Fwd_Primer_Hash)
## 25% 50% 75% 100%
## 13668885842 26663877899 44900388165 62098312721
counts <- table(data$cluster, fwd_primer_Hash)
barplot(counts, main="Amplicon Distribution by Fwd Primer Hash and Clusters",
xlab="Hash", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
rev_primer_Hash <- getFactoredDataTrunc(data$Rev_Primer_Hash)
## 25% 50% 75% 100%
## 18803378911 33335803072 48501833706 62915016535
counts <- table(data$cluster, rev_primer_Hash)
barplot(counts, main="Amplicon Distribution by Rev Primer Hash and Clusters",
xlab="Hash", col=c("darkblue","red","green","blue"),
legend = rownames(counts))
histogram(~data$cluster|data$Fwd_END_3, main = "Correlation clustering")
histogram(~data$cluster|data$Rev_END_3, main = "Correlation clustering")
histogram(~data$cluster|data$Start_Dangling_End, main = "Correlation clustering")
histogram(~data$cluster|data$Stop_Dangling_End, main = "Correlation clustering")